*************************************************
* Purpose -- To clean downloaded QCEW quarterly data
* for analysis
************************************************
clear all
* RUN 00_path_master.do FIRST TO GET FILEPATHS

************************************************
* Keeping data we need, appending and reshaping
************************************************

* Keeping county aggregate employment data and county-wise data for restaurant sector, first naics
* for codes, see - https://www.bls.gov/cew/classifications/aggregation/agg-level-titles.htm
import delimited using "$raw/qcew/quarterly/1990.q1-q4.singlefile.csv", clear

keep if (agglvl_code == 70) | ///
(agglvl_code == 75 & industry_code == "722")
tostring disclosure_code, replace
keep if disclosure_code == ""

forvalues year = 1991/2019 {
	display "Working with `year'"
	preserve
		import delimited using "$raw/qcew/quarterly/`year'.q1-q4.singlefile.csv", clear
		keep if (agglvl_code == 70) | ///
		(agglvl_code == 75 & industry_code == "722")
		
		if `year' <= 2000 {
			tostring disclosure_code, replace
			keep if disclosure_code == ""
		}
		
		tempfile year
		save `year'
	restore
	
	append using `year'
}

gen yq = qofd(dofq(yq(year, qtr)))
format yq %tq

drop if (mi(month1_emplvl) | mi(month2_emplvl) | mi(month3_emplvl) | mi(total_qtrly_wages)) & inrange(own_code,1,4) 
bys area_fips yq agglvl_code: egen own_code_max = max(own_code)

gen qtrly_avg_emplvl = (month1_emplvl + month2_emplvl + month3_emplvl)/3

foreach var in qtrly_estabs qtrly_avg_emplvl total_qtrly_wages {
	replace `var' = . if own_code_max!=5 & agglvl_code == 75
}

sort area_fips yq agglvl_code own_code, stable
collapse (sum) qtrly_estabs qtrly_avg_emplvl total_qtrly_wages (last) disclosure_code year, by(area_fips yq agglvl_code) 


* Adding county names
preserve
	import excel using "$raw/qcew/area-titles-xlsx.xlsx", firstrow clear
	tempfile names
	save `names'
restore

merge m:1 area_fips using `names', assert(2 3) keep(3) nogen

* Destringing area identifiers

g countyfips = real(area_fips)
g statefips = int(real(substr(area_fips,1,2)))

* Saving an intermediate version
save "$raw/qcew/quarterly/county_quarterly_appended.dta", replace

use "$raw/qcew/quarterly/county_quarterly_appended.dta", replace

* Reshaping data to area-year observations
keep statefips countyfips year yq qtrly_estabs qtrly_avg_emplvl total_qtrly_wages agglvl_code disclosure_code area_title
reshape wide qtrly_estabs qtrly_avg_emplvl total_qtrly_wages disclosure_code, i(countyfips yq) j(agglvl_code)

* Cleaning based on Dorn document on county fips changes - https://www.ddorn.net/data/FIPS_County_Code_Changes.pdf
drop if countyfips == 46113 & year == 2015
replace countyfips = 46113 if countyfips == 46102

drop if inlist(statefips,72,78,2,11,15)	
drop if strpos(area_title,"Unknown") != 0
fillin countyfips yq
bys countyfips (area_title): replace area_title = area_title[_N] if mi(area_title)
bys countyfips (statefips): replace statefips = statefips[1] if mi(statefips)
xtset countyfips yq

************************************************
* Merging with population data (only available annually)
************************************************
preserve
	use "$clean/other/county_pop_15_64.dta", clear
	replace countyfips = 8001 if countyfips == 8911
	replace countyfips = 8013 if countyfips == 8912
	replace countyfips = 8059 if countyfips == 8913
	replace countyfips = 8123 if countyfips == 8914
	
	expand 2 if countyfips == 4910 & inrange(year,1990,1993), gen(expanded)
	replace countyfips = 4012 if countyfips == 4910 & inrange(year,1990,1993) & expanded == 0
	replace countyfips = 4027 if countyfips == 4910 & inrange(year,1990,1993) & expanded == 1
	drop expanded
	
	* disp 77158/(77158 + 9045) 
	* we get 0.895
	
	replace pop = 0.895*pop if countyfips == 4027 & inrange(year,1990,1993)
	replace pop = 0.105*pop if countyfips == 4012 & inrange(year,1990,1993)
	replace pop = int(pop) if inlist(countyfips,4012,4027) & inrange(year,1990,1993)
	
	tempfile pop
	save `pop'
restore

merge m:1 countyfips year using `pop', keep(1 3)
bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

************************************************
* Merging with minimum wage data
************************************************
preserve
	use "$raw/other/mw_state_quarterly_vz.dta", clear
	rename quarterly_date yq
	tempfile mw_vz
	save `mw_vz'
restore

merge m:1 statefips yq using `mw_vz', keep(1 3)

bys countyfips (yq): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

rename max_mw mw
drop *_mw

gen fed_mw = .

replace fed_mw = 3.10 if inrange(yq, 80, 83) // 1980
replace fed_mw = 3.35 if inrange(yq, 84, 120) // 1981q1-1990q1 
replace fed_mw = 3.80 if inrange(yq, 121, 124) // 1990q2-1991q1
replace fed_mw = 4.25 if inrange(yq, 125, 146) // 1991q2-1996q3
replace fed_mw = 4.75 if inrange(yq, 147, 149) // 1996q4-1997q2
replace fed_mw = 5.15 if inrange(yq, 150, 189) // 1997q3-2007q2
replace fed_mw = 5.85 if inrange(yq, 190, 193) // 2007q3-2008q2
replace fed_mw = 6.55 if inrange(yq, 194, 197) // 2008q3-2009q2
replace fed_mw = 7.25 if yq >= 198 // 2009q3-

// source - https://www.dol.gov/agencies/whd/minimum-wage/history/chart

************************************************
* Merging with price index
************************************************

preserve
	import excel using "$raw/other/r-cpi-u-rs-allitems.xlsx", cellrange(A6) firstrow clear
	rename YEAR - DEC, lower
	keep year - dec
	rename (jan - dec) cpi#, addnumber
	reshape long cpi, i(year) j(month)
	gen yq = qofd(dofm(ym(year, month)))
	format %tq yq
	keep if inrange(year,1990,2019)
	
	* Collapse to quarterly level
	collapse (mean) cpi, by(yq)
	
	* Reindex to have 2023q1 as 100
	gen index = (cpi/443.23)*100 // 443.23 is index for 2023q1
	replace index = index/100
	
	drop cpi
	tempfile cpi_rs
	save `cpi_rs'
restore

merge m:1 yq using `cpi_rs', assert(2 3) keep(3) nogen

************************************************
* Merging with CZ data
************************************************

preserve
	use "$raw/jnr-cbp/cw_cty_czone_mod_AADHP.dta", clear
	rename cty_fips countyfips
	tempfile cz
	save `cz'
restore

merge m:1 countyfips using `cz', keep(1 3)
bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean


************************************************
* Balanced panel
************************************************

foreach i in 70 75 {
	gen missing`i' = mi(qtrly_avg_emplvl`i')
	gen suppressed`i' = disclosure_code`i' == "N"
	foreach var in qtrly_estabs`i' qtrly_avg_emplvl`i' total_qtrly_wages`i' {
		replace `var' = . if suppressed`i' == 1
	}
	gen mis_sup`i' = missing`i' == 1 | suppressed`i' == 1
	bys countyfips (yq): egen prop_mis_sup`i' = mean(mis_sup`i')
	gen balanced_full_`i' = prop_mis_sup`i' == 0
	gen balanced_90_`i' = prop_mis_sup`i' <= .1
}

drop _fillin

save "$clean/qcew/qcew_quarterly_clean_county.dta", replace




************************************************
* Yearly version
************************************************

use "$raw/qcew/quarterly/county_quarterly_appended.dta", clear

* Reshaping data to area-quarter observations
keep statefips countyfips year yq qtrly_estabs qtrly_avg_emplvl total_qtrly_wages agglvl_code disclosure_code area_title
reshape wide qtrly_estabs qtrly_avg_emplvl total_qtrly_wages disclosure_code, i(countyfips yq) j(agglvl_code)

* Cleaning based on Dorn document on county fips changes - https://www.ddorn.net/data/FIPS_County_Code_Changes.pdf
drop if countyfips == 46113 & year == 2015
replace countyfips = 46113 if countyfips == 46102

drop if inlist(statefips,72,78,2,11,15)	
drop if strpos(area_title,"Unknown") != 0
fillin countyfips yq
bys countyfips (area_title): replace area_title = area_title[_N] if mi(area_title)
bys countyfips (statefips): replace statefips = statefips[1] if mi(statefips)
xtset countyfips yq


* Collapse to yearly 
sort countyfips year disclosure_code70 disclosure_code75, stable

collapse (mean) annual_avg_estabs70=qtrly_estabs70 ///
	annual_avg_emplvl70=qtrly_avg_emplvl70 ///
	annual_avg_estabs75=qtrly_estabs75 annual_avg_emplvl75=qtrly_avg_emplvl75 ///
	(sum) total_annual_wages70=total_qtrly_wages70 ///
	total_annual_wages75=total_qtrly_wages75 ///
	(first) area_title statefips (last) disclosure_code70 disclosure_code75, by(countyfips year)


************************************************
* Merging with population data
************************************************
preserve
	use "$clean/other/county_pop_15_64.dta", clear
	replace countyfips = 8001 if countyfips == 8911
	replace countyfips = 8013 if countyfips == 8912
	replace countyfips = 8059 if countyfips == 8913
	replace countyfips = 8123 if countyfips == 8914
	
	expand 2 if countyfips == 4910 & inrange(year,1990,1993), gen(expanded)
	replace countyfips = 4012 if countyfips == 4910 & inrange(year,1990,1993) & expanded == 0
	replace countyfips = 4027 if countyfips == 4910 & inrange(year,1990,1993) & expanded == 1
	drop expanded
	
	* disp 77158/(77158 + 9045) 
	* we get 0.895
	
	replace pop = 0.895*pop if countyfips == 4027 & inrange(year,1990,1993)
	replace pop = 0.105*pop if countyfips == 4012 & inrange(year,1990,1993)
	replace pop = int(pop) if inlist(countyfips,4012,4027) & inrange(year,1990,1993)
	
	tempfile pop
	save `pop'
restore

merge 1:1 countyfips year using `pop', keep(1 3)
bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

************************************************
* Merging with minimum wage data
************************************************
merge m:1 statefips year using "$raw/other/mw_state_annual_vz.dta", keep(1 3)

bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

rename max_mw mw
drop *_mw

gen fed_mw = .

replace fed_mw = 3.10 if year == 1980
replace fed_mw = 3.35 if year >= 1981 & year <=1989
replace fed_mw = 3.80 if year == 1990
replace fed_mw = 4.25 if year >=1991 & year<=1995
replace fed_mw = 4.75 if year == 1996
replace fed_mw = 5.15 if year>=1997 & year<=2006
replace fed_mw = 5.85 if year == 2007
replace fed_mw = 6.55 if year == 2008
replace fed_mw = 7.25 if year>=2009

// source - https://www.dol.gov/agencies/whd/minimum-wage/history/chart


************************************************
* Merging with price index
************************************************

preserve

	import excel using "$raw/other/r-cpi-u-rs-allitems.xlsx", cellrange(A6) firstrow clear
	rename YEAR - AVG, lower
	keep year avg
	keep if inrange(year,1990,2019)
	
	* reindex to have 2023 as 100
	gen index = (avg/449.3)*100 //449.3 is index for 2023
	replace index = index/100
	
	drop avg
	tempfile cpi_rs
	save `cpi_rs'
restore

merge m:1 year using `cpi_rs', assert(2 3) keep(3) nogen

************************************************
* Merging with CZ data
************************************************

preserve
	use "$raw/jnr-cbp/cw_cty_czone_mod_AADHP.dta", clear
	rename cty_fips countyfips
	tempfile cz
	save `cz'
restore

merge m:1 countyfips using `cz', keep(1 3)
bys countyfips (year): egen merge_mean = mean(_merge)

keep if merge_mean == 3

drop _merge merge_mean

************************************************
* Balanced panel
************************************************

foreach i in 70 75 {
	gen missing`i' = mi(annual_avg_emplvl`i')
	gen suppressed`i' = disclosure_code`i' == "N"
	foreach var in annual_avg_estabs`i' annual_avg_emplvl`i' total_annual_wages`i' {
		replace `var' = . if suppressed`i' == 1
	}
	gen mis_sup`i' = missing`i' == 1 | suppressed`i' == 1
	bys countyfips (year): egen prop_mis_sup`i' = mean(mis_sup`i')
	gen balanced_full_`i' = prop_mis_sup`i' == 0
	gen balanced_90_`i' = prop_mis_sup`i' <= .1
}

save "$clean/qcew/qcew_quarterly_annual_clean_county.dta", replace


